## Set code chunk option defaults
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

## Load packages needed
library(tidyverse)
library(viridis)

## Set theme for plots (only works when you load ggplot2, which can be done using library(ggplot2) OR with library(tidyverse))
theme_set(theme_bw())

scale_color_continuous <- function(...) scale_color_viridis_c(...)
scale_fill_continuous <- function(...) scale_fill_viridis_c(...)

scale_color_discrete <- function(...) scale_color_viridis_d(...)
scale_fill_discrete <- function(...) scale_fill_viridis_d(...)

## Import data and manipulate a bit
framingham <- read_csv("../../data/Framingham/framingham.csv") %>% 
  rename(gender = male) %>% 
  mutate(gender = if_else(gender == 1, 'male', 'female'))

Introduction

About FHS:

Side note: Beaver Dam Eye Study here in Wisconsin is very similar. They are now on their third generation of subjects as well.

Subset of FHS data (from kaggle). Data from 4240 subjects collected over a 10 year period. Outcome: risk factors for developing Coronary Heart Disease (CHD).

## Set options for DT
options(DT.options = list(scrollX = TRUE))

## Print data using DT::datatable
framingham %>% 
  DT::datatable()

Categorical variables

cat_vars <- c('gender', 'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'TenYearCHD')

ord_vars <- 'education'

cont_vars <- setdiff(colnames(framingham), c(cat_vars, ord_vars))

The categorical variables in this data set: gender, currentSmoker, BPMeds, prevalentStroke, prevalentHyp, diabetes, TenYearCHD.

Ordinal variables: education.

First thing to do: summarize these variables using tables of frequencies.

## Change categorical variables to categorical variables
framingham <- framingham %>% 
  mutate_at(vars({cat_vars}), as.character) %>% 
  mutate(education = fct_explicit_na(as_factor(education), na_level = 'NA'))

## Create tables for all discrete variables
cat_tables <- framingham %>% 
  select({c(cat_vars, ord_vars)}) %>% 
  gather(key = 'Variable') %>% 
  nest(data = value) %>% 
  mutate(tables = map(data, function(x){
    x %>% 
      janitor::tabyl(value) %>% 
      janitor::adorn_totals() %>% 
      select(-contains('valid')) %>% 
      rename(Frequency = n, `Relative Frequency` = percent)
  }))

## Print without relative frequencies
cat_tables$tables %>% 
  map(~.x %>% select(-`Relative Frequency`)) %>% 
  set_names(cat_tables$Variable) %>% 
  pander::pander()
  • gender:

    value Frequency
    female 2420
    male 1820
    Total 4240
  • currentSmoker:

    value Frequency
    0 2145
    1 2095
    Total 4240
  • BPMeds:

    value Frequency
    0 4063
    1 124
    NA 53
    Total 4240
  • prevalentStroke:

    value Frequency
    0 4215
    1 25
    Total 4240
  • prevalentHyp:

    value Frequency
    0 2923
    1 1317
    Total 4240
  • diabetes:

    value Frequency
    0 4131
    1 109
    Total 4240
  • TenYearCHD:

    value Frequency
    0 3596
    1 644
    Total 4240
  • education:

    value Frequency
    1 1720
    2 1253
    3 689
    4 473
    NA 105
    Total 4240

To find relative frequencies: divide frequencies by total. This gives us an opportunity to see if any variables are very skewed (i.e. some categories very rare), assess how much data is missing, etc.

cat_tables$tables %>% 
  set_names(cat_tables$Variable) %>% 
  pander::pander()
  • gender:

    value Frequency Relative Frequency
    female 2420 0.5708
    male 1820 0.4292
    Total 4240 1
  • currentSmoker:

    value Frequency Relative Frequency
    0 2145 0.5059
    1 2095 0.4941
    Total 4240 1
  • BPMeds:

    value Frequency Relative Frequency
    0 4063 0.9583
    1 124 0.02925
    NA 53 0.0125
    Total 4240 1
  • prevalentStroke:

    value Frequency Relative Frequency
    0 4215 0.9941
    1 25 0.005896
    Total 4240 1
  • prevalentHyp:

    value Frequency Relative Frequency
    0 2923 0.6894
    1 1317 0.3106
    Total 4240 1
  • diabetes:

    value Frequency Relative Frequency
    0 4131 0.9743
    1 109 0.02571
    Total 4240 1
  • TenYearCHD:

    value Frequency Relative Frequency
    0 3596 0.8481
    1 644 0.1519
    Total 4240 1
  • education:

    value Frequency Relative Frequency
    1 1720 0.4057
    2 1253 0.2955
    3 689 0.1625
    4 473 0.1116
    NA 105 0.02476
    Total 4240 1

Q: Why do we care about the relative frequencies, when they are so closely related to the frequency counts?

A: For comparisons.

Consider frequency counts of male vs. female smokers. There are more female non-smokers than male non-smokers… but that could just be because there are more female than male subjects!

framingham %>% 
  janitor::tabyl(gender, currentSmoker) %>% 
  janitor::adorn_totals(where = c('row', 'col')) %>% 
  rename(`gender \\\\ currentSmoker` = gender) %>% 
  pander::pander()
gender \ currentSmoker 0 1 Total
female 1431 989 2420
male 714 1106 1820
Total 2145 2095 4240

If we revert to relative frequencies, it becomes easier to compare across groups.

framingham %>% 
  janitor::tabyl(gender, currentSmoker) %>% 
  janitor::adorn_percentages(denominator = 'row') %>% 
  janitor::adorn_totals('col') %>% 
  rename(`gender \\\\ currentSmoker` = gender) %>% 
  pander::pander()
gender \ currentSmoker 0 1 Total
female 0.5913 0.4087 1
male 0.3923 0.6077 1

Graphical Representation

Too many tables, not enough figures!

Best way to display categorical data: bar chart!

bar_charts <- framingham %>% 
  select({c(cat_vars, ord_vars)}) %>% 
  gather(key = 'Variable') %>% 
  ggplot(aes(x = value)) +
    facet_wrap(~Variable, scales = 'free', ncol = 2) +
    geom_bar() + 
    labs(y = 'Frequency')

plotly::ggplotly(bar_charts)

Can easily do the same with relative frequencies:

bar_charts_rel <- framingham %>% 
  select({c(cat_vars, ord_vars)}) %>% 
  gather(key = 'Variable') %>% 
  group_by(Variable) %>% 
  count(value) %>% 
  mutate(y = n/sum(n)) %>% 
  ggplot(aes(x = value, y = y)) +
    facet_wrap(~Variable, scales = 'free_x', ncol = 2) +
    geom_bar(stat = 'identity') + 
    labs(y = 'Relative Frequency', x = '')

plotly::ggplotly(bar_charts_rel)

As you might notice, the bar charts displaying frequencies and relative frequencies look very similar… actually the only change is the scale on the y-axis. Again, the real benefit of using relative frequencies is when comparing between groups.

gender_smoker <- framingham %>% 
  group_by(gender, currentSmoker) %>% 
  summarise(Frequency = n()) %>% 
  group_by(gender) %>% 
  mutate(`Relative Frequency` = Frequency/sum(Frequency)) 

plotly::ggplotly(ggplot(gender_smoker, 
                        aes(x = gender, y = Frequency, fill = currentSmoker)) + 
                   geom_bar(position = 'dodge', stat = 'identity'))
plotly::ggplotly(
  ggplot(gender_smoker, 
         aes(x = gender, y = `Relative Frequency`, fill = currentSmoker)) + 
    geom_bar(position = 'dodge', stat = 'identity') + 
    lims(y = c(0, 1))
)

Maybe education influences ones decision to smoke? From the frequencies it seems that more people in education category 1 smoke than in any other category. Does that mean you’re more likely to smoke if you’re in education category 1?

education_smoker <- framingham %>% 
  group_by(education, currentSmoker) %>% 
  summarise(Frequency = n()) %>% 
  group_by(education) %>% 
  mutate(`Relative Frequency` = Frequency/sum(Frequency)) 

plotly::ggplotly(
  ggplot(education_smoker, 
         aes(x = education, y = Frequency, fill = currentSmoker)) + 
    geom_bar(position = 'dodge', stat = 'identity')
)

Overall, most people are in education category 1, which means that we cannot simply compare frequencies across categories. If we convert to relative frequencies, however, such a comparison is much easier.

plotly::ggplotly(
  ggplot(education_smoker, 
         aes(x = education, y = `Relative Frequency`, fill = currentSmoker)) + 
    geom_bar(position = 'dodge', stat = 'identity') + 
    lims(y = c(0, 1))
)

We see that if you are in categories 1 or 3 you’re more likely to be a non-smoker, whereas if you are in category 2, you’re more likely to be a smoker.

BREAK

Continuous variables

The continuous variables in this data set: age,cigsPerDay,totChol,sysBP,diaBP,BMI,heartRate,glucose.

For continuous variables, we want to get an idea of the location and spread of the observations. Often measured by mean and variance (/standard deviation), or median and IQR/range.

not_missing <- function(x, ...) sum(!is.na(x))
framingham %>%
  select({cont_vars}) %>% 
  gather(key = 'Variable') %>% 
  group_by(Variable) %>% 
  summarise_at(vars(value), 
               list(`n not missing` = not_missing,
                    Mean = mean,
                    Variance = var,
                    SD = sd,
                    Median = median,
                    Min = min, 
                    Max = max,
                    IQR = IQR),
               na.rm = T) %>% 
  pander::pander()
Variable n not missing Mean Variance SD Median Min Max IQR
age 4240 49.58 73.5 8.573 49 32 70 14
BMI 4221 25.8 16.65 4.08 25.4 15.54 56.8 4.97
cigsPerDay 4211 9.006 142.1 11.92 0 0 70 20
diaBP 4240 82.9 141.9 11.91 82 48 142.5 15
glucose 3852 81.96 573.8 23.95 78 40 394 16
heartRate 4239 75.88 144.6 12.03 75 44 143 15
sysBP 4240 132.4 485.5 22.03 128 83.5 295 27
totChol 4190 236.7 1988 44.59 234 107 696 57

Mean and variance are often visualized using a histogram. Below are histograms for all the different variables. Notice what differences in means and variances result in.

histogram_data <- framingham %>% 
  select({cont_vars}) %>% 
  gather(value = 'value', key = 'Variable')

binwidths <- c(5, 5, 15, 10, 5, 2.5, 5, 5) %>% set_names(cont_vars)
  
hists <- htmltools::tagList()

for (var in cont_vars){
  hist <- histogram_data %>% 
    filter(Variable == var) %>% 
    ggplot(aes(x = value)) + 
      geom_histogram(boundary = 0, binwidth = binwidths[var]) + 
      geom_text(data = histogram_data %>% 
                  filter(Variable == var) %>% 
                  summarise(text = paste0('Mean: ', round(mean(value, na.rm = T), digits = 3), '\n',
                                          'SD: ', round(sd(value, na.rm = T), digits = 3)),
                            x = 0.9*max(value, na.rm = T),
                            y = 600),
                aes(x = x, y = y, label = text)) +
      labs(x = var, y = 'Frequency')
  
  hists[[var]] <- plotly::ggplotly(hist)
}

hists

Notice how the histograms display frequencies for each chosen bin. This is also often translated into relative frequencies by… dividing by the total number of observations. As with the categorical variables, relative frequencies makes it easier to compre between groups.

Let’s consider all variables, but broken down by gender:

cont_summaries_by_gender <- framingham %>% 
  select({cont_vars}, gender) %>% 
  gather(value = 'value', key = 'Variable',
         cont_vars) %>% 
  group_by(gender, Variable) %>% 
  summarise_at(vars(value), 
               list(`n not missing` = not_missing,
                    Mean = mean,
                    Variance = stats::var,
                    SD = sd,
                    Median = median,
                    Min = min,
                    Max = max,
                    IQR = IQR),
               na.rm = T) %>% 
  arrange(Variable, gender) %>% 
  mutate_if(is.numeric,
            round, digits = 3)

DT::datatable(cont_summaries_by_gender)

Let’s take a closer look at the totChol variable. How do you relate mean and standard deviation to the histograms?

totChol_hists <- framingham %>% 
  ggplot(aes(x = totChol, fill = gender)) + 
    geom_histogram(aes(y = binwidths['totChol']*..density..),
                   boundary = 0,
                   position = 'identity', binwidth = binwidths['totChol']) + 
    facet_grid(gender~.) + 
    geom_text(data = framingham %>% 
                group_by(gender) %>% 
                summarise(text = paste('Mean: ', round(mean(totChol, na.rm = T), digits = 3), '\n',
                                       'SD: ', round(sd(totChol, na.rm = T), digits = 3)),
                          x = 600, y = 0.1),
              aes(x = x, y = y, label = text)) +
    geom_vline(data = framingham %>% 
                 group_by(gender) %>% 
                 summarise(x = mean(totChol, na.rm = T)),
               aes(xintercept = x), color = 'red', linetype = 'dashed') +
    labs(y = 'Relative Frequencies', caption = 'red dotted line is mean') + 
    theme(legend.position = 'none')

plotly::ggplotly(
  totChol_hists
)

What if we consider cigsPerDay? You still like the mean as the measure of location?

cigsPerDay_hists <- framingham %>% 
  ggplot(aes(x = cigsPerDay, fill = gender)) + 
    geom_histogram(aes(y = ..density..*binwidths['cigsPerDay']),
                   boundary = 0,
                   position = 'identity', binwidth = binwidths['cigsPerDay']) + 
    facet_grid(gender~.) + 
    geom_text(data = framingham %>% 
                group_by(gender) %>% 
                summarise(text = paste('Mean: ', round(mean(cigsPerDay, na.rm = T), digits = 3), '\n',
                                       'SD: ', round(sd(cigsPerDay, na.rm = T), digits = 3)),
                          x = 40, y = 0.5),
              aes(x = x, y = y, label = text)) +
    geom_vline(data = framingham %>% 
                 group_by(gender) %>% 
                 summarise(x = mean(cigsPerDay, na.rm = T)),
               aes(xintercept = x), color = 'red', linetype = 'dashed') +
    labs(y = 'Relative Frequencies', caption = 'red dotted line is mean') + 
    theme(legend.position = 'none')

plotly::ggplotly(cigsPerDay_hists)

Maybe for the male group, but I don’t like it for females. The problem here is that the data is very skewed – there are way more values close to zero than there are large values. In these situations, we often prefer the median and range/IQR. These values are best depicted using the box plot. Here’s the ‘answer key’ to a general boxplot:

ggplot(data = data.frame(y = c(1,2,5,10,10,10,11,13,13,13,15,15,15,15,16,20,24)),
       aes(x = 1, y = y)) + 
  geom_boxplot(width = 0.5) + 
  geom_text(data = data.frame(y = c(1.5,5,10,13,15,20,24),
                               labels = c("outliers", "minimum\n(excluding outliers)",
                                          "lower quartile", "median", "upper quartile", 
                                          "maximum\n(excluding outliers)", "outlier")),
            aes(x = 1.75, label = labels),
            hjust = 0) +
  geom_segment(data = data.frame(y = c(1.5,1.5,5,10,13,15,20,24),
                                 yend = c(1,2,5,10,13,15,20,24)),
               aes(x = 1.65, xend = c(1.1, 1.1, 1.1, 1.35, 1.35, 1.35, 1.1, 1.1),
                   y = y, yend = yend),
               arrow = arrow(length = unit(0.03, "npc")),
               color = 'darkred') +
  theme_minimal() +
  labs(x = '', y = '') +
  scale_x_continuous("", limits = c(0,2.2)) +
  theme(panel.grid = element_blank(),
        axis.text = element_blank(),
        plot.margin=grid::unit(c(0,0,0,0), "mm"))

boxplots_contvars <- framingham %>% 
  select({cont_vars}, gender) %>% 
  gather(value = 'value', key = 'Variable',
         cont_vars)

boxplots <- htmltools::tagList()

for (var in cont_vars){
  tmp <- boxplots_contvars %>% 
    filter(Variable == var) %>% 
    ggplot(aes(x = gender, y = value, fill = gender)) + 
      geom_boxplot(alpha = 0.75) + 
      theme(legend.position = 'none') +
      lims(y = c(0, NA)) +
      labs(x = '', title = var)
  
  boxplots[[var]] <- plotly::ggplotly(tmp)
}

boxplots